Imports
In [1]:
import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.manifold import TSNE
from sklearn.decomposition import PCA
from ydata_profiling import ProfileReport
from sklearn.preprocessing import OrdinalEncoder
from scipy import stats
Load data
In [2]:
# Load the dataset from a CSV file; the path is relative to the working directory.
DATA = pd.read_csv('data/input.csv')
In [3]:
# Render matplotlib figures inline in the notebook output.
%matplotlib inline
# Build a ydata-profiling report over the whole frame; the bare `profile`
# expression on the last line displays it as the cell output.
profile = ProfileReport(DATA, title="report")
profile
Out[3]:
In [4]:
# Split the column names by dtype: object/category columns are treated as
# categorical features (the target 'y' is left out of that list), numeric
# dtypes as numerical features.
categorical_cols = (
    DATA.select_dtypes(include=['object', 'category'])
    .columns
    .drop('y', errors='ignore')
    .tolist()
)
numerical_cols = DATA.select_dtypes(include=['number']).columns
In [5]:
# Show the categorical feature names as one comma-separated line.
print(f"Columns categorical: {', '.join(categorical_cols)}")
Columns categorical: Gender, family_history_with_overweight, FAVC, CAEC, SMOKE, SCC, CALC, MTRANS
In [6]:
# Show the numerical feature names as one comma-separated line.
# Labels are passed through str() so non-string column labels cannot break the join.
print('Columns numerical: ' + ', '.join(map(str, numerical_cols)))
Columns nuemrical: Age, FCVC, NCP, CH2O, FAF, TUE
In [7]:
# Summary statistics (count, mean, std, min, quartiles, max) of the numerical columns.
DATA[numerical_cols].describe()
Out[7]:
| | Age | FCVC | NCP | CH2O | FAF | TUE |
|---|---|---|---|---|---|---|
| count | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 | 2111.000000 |
| mean | 24.312600 | 2.419043 | 2.685628 | 2.008011 | 1.010298 | 0.657866 |
| std | 6.345968 | 0.533927 | 0.778039 | 0.612953 | 0.850592 | 0.608927 |
| min | 14.000000 | 1.000000 | 1.000000 | 1.000000 | 0.000000 | 0.000000 |
| 25% | 19.947192 | 2.000000 | 2.658738 | 1.584812 | 0.124505 | 0.000000 |
| 50% | 22.777890 | 2.385502 | 3.000000 | 2.000000 | 1.000000 | 0.625350 |
| 75% | 26.000000 | 3.000000 | 3.000000 | 2.477420 | 1.666678 | 1.000000 |
| max | 61.000000 | 3.000000 | 4.000000 | 3.000000 | 3.000000 | 2.000000 |
In [8]:
def create_corr_matrix(data):
    """Draw an annotated heatmap of the pairwise correlations of ``data``.

    Parameters
    ----------
    data : pandas.DataFrame
        Frame whose ``.corr()`` matrix is plotted; each column of that
        matrix gets one row and one column in the heatmap.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    corr = data.corr()
    labels = corr.columns
    n = len(labels)

    fig, ax = plt.subplots()
    # Correlations live in [-1, 1]; pinning the limits keeps the neutral colour
    # of the diverging 'coolwarm' map exactly at zero.
    im = ax.imshow(corr, cmap='coolwarm', vmin=-1, vmax=1)

    ticks = np.arange(n)
    ax.set_xticks(ticks)
    ax.set_yticks(ticks)
    ax.set_xticklabels(labels)
    ax.set_yticklabels(labels)
    plt.setp(ax.get_xticklabels(), rotation=45, ha="right", rotation_mode="anchor")

    for i in range(n):
        for j in range(n):
            value = corr.iloc[i, j]
            # White text vanishes on the pale cells near zero, so use black there.
            color = "w" if abs(value) > 0.5 else "k"
            ax.text(j, i, round(value, 2), ha="center", va="center", color=color)

    # Attach the colorbar to this figure/axes explicitly instead of relying on
    # pyplot's notion of the "current" axes.
    fig.colorbar(im, ax=ax)
    plt.show()
# Correlation heatmap restricted to the numerical feature columns.
create_corr_matrix(DATA[numerical_cols])
In [9]:
# One distribution plot per numerical feature, coloured by the target column 'y'.
for feature in numerical_cols:
    sns.displot(data=DATA, x=feature, hue="y", kde=False, height=5, aspect=2)
    plt.show()
In [10]:
def plot_dim_reduced(x, reductor, title, y_label, labels=None):
    """Scatter-plot a 2-D projection of ``x`` coloured by the target column.

    Parameters
    ----------
    x : pandas.DataFrame
        Feature table that also contains the column ``y_label``; that column
        is removed before the projection is fitted.
    reductor : object
        Estimator exposing ``fit_transform`` that yields two output columns
        (e.g. ``PCA(n_components=2)``).
    title : str
        Prefix for the two projected axes, named ``f'{title}1'`` and
        ``f'{title}2'``.
    y_label : str
        Name of the target column; it is used as the hue of the plot.
    labels : array-like, optional
        Values used for colouring, one per row of ``x``. Defaults to
        ``DATA[y_label]`` from the notebook's global frame.

    Returns
    -------
    None
        The figure is displayed with ``plt.show()``.
    """
    if labels is None:
        labels = DATA[y_label]
    features = x.drop(columns=y_label)
    projected = pd.DataFrame(
        reductor.fit_transform(features),
        columns=[f'{title}1', f'{title}2'],
    )
    # The hue column must be stored under `y_label`, because that is the name
    # lmplot looks up via `hue=y_label`. Assigning by position means a
    # non-default index on the label source cannot misalign rows.
    projected[y_label] = np.asarray(labels)
    sns.lmplot(data=projected, x=f'{title}1', y=f'{title}2', hue=y_label, fit_reg=False)
    plt.show()
# Ordinal-encode only the non-numeric columns. Numeric columns keep their real
# values: running OrdinalEncoder over them would replace every value by its
# rank among the column's unique values and distort the projections.
# NOTE(review): features are not standardised before PCA / t-SNE — confirm
# that this is intended.
non_numeric_cols = DATA.select_dtypes(exclude=['number']).columns
encoded_categoricals = pd.DataFrame(
    OrdinalEncoder().fit_transform(DATA[non_numeric_cols]),
    columns=non_numeric_cols,
    index=DATA.index,
)
# Re-select DATA.columns so the original column order is preserved.
all_numerical_data = DATA.drop(columns=non_numeric_cols).join(encoded_categoricals)[DATA.columns]

plot_dim_reduced(all_numerical_data, PCA(n_components=2), 'PCA', 'y')
# The t-SNE projection gets its own axis prefix so its axes are not labelled as PCA.
plot_dim_reduced(all_numerical_data, TSNE(n_components=2, random_state=0), 'TSNE', 'y')
In [11]:
# Cumulative explained variance of the leading principal components, fitted on
# the features only: the encoded target 'y' is dropped so it cannot contribute
# to the components.
N_COMPONENTS = 8
pca_features = all_numerical_data.drop(columns='y')
pca = PCA(n_components=N_COMPONENTS).fit(pca_features)
cumulative_variance = np.cumsum(pca.explained_variance_ratio_)

fig, ax = plt.subplots(figsize=(7, 5))
# Plot against 1..N so the x axis really is the number of components kept
# (plotting the bare array would start the axis at 0).
ax.plot(np.arange(1, len(cumulative_variance) + 1), cumulative_variance)
ax.set_xlabel('number of components')
ax.set_ylabel('cumulative explained variance')
plt.show()
In [12]:
# Numeric feature columns plus an ordinal-encoded copy of the target 'y'.
encoder = OrdinalEncoder(handle_unknown='use_encoded_value', unknown_value=-1)
encoded = encoder.fit_transform(DATA)
# Pick the target by column name rather than assuming it is the last column.
ys = encoded[:, DATA.columns.get_loc('y')]
# assign() builds a new frame instead of writing into the select_dtypes()
# result, which sidesteps chained-assignment warnings and keeps the cell
# safe to re-run.
n_df = DATA.select_dtypes(include='number').assign(y=ys)
In [13]:
# Display the frame of numeric features together with the encoded target column 'y'.
n_df
Out[13]:
| | Age | FCVC | NCP | CH2O | FAF | TUE | y |
|---|---|---|---|---|---|---|---|
| 0 | 21.000000 | 2.0 | 3.0 | 2.000000 | 0.000000 | 1.000000 | 0.0 |
| 1 | 21.000000 | 3.0 | 3.0 | 3.000000 | 3.000000 | 0.000000 | 0.0 |
| 2 | 23.000000 | 2.0 | 3.0 | 2.000000 | 2.000000 | 1.000000 | 0.0 |
| 3 | 27.000000 | 3.0 | 3.0 | 2.000000 | 2.000000 | 0.000000 | 4.0 |
| 4 | 22.000000 | 2.0 | 1.0 | 2.000000 | 0.000000 | 0.000000 | 5.0 |
| ... | ... | ... | ... | ... | ... | ... | ... |
| 2106 | 20.976842 | 3.0 | 3.0 | 1.728139 | 1.676269 | 0.906247 | 3.0 |
| 2107 | 21.982942 | 3.0 | 3.0 | 2.005130 | 1.341390 | 0.599270 | 3.0 |
| 2108 | 22.524036 | 3.0 | 3.0 | 2.054193 | 1.414209 | 0.646288 | 3.0 |
| 2109 | 24.361936 | 3.0 | 3.0 | 2.852339 | 1.139107 | 0.586035 | 3.0 |
| 2110 | 23.664709 | 3.0 | 3.0 | 2.863513 | 1.026452 | 0.714137 | 3.0 |
2111 rows × 7 columns
In [14]:
# Grid of pairwise plots over the columns of n_df, coloured by the encoded target 'y'.
sns.pairplot(n_df, hue='y')
Out[14]:
<seaborn.axisgrid.PairGrid at 0x713c0c3c5410>
In [15]:
# Outlier analysis: absolute z-scores of every feature column (target excluded),
# followed by a per-column count of values with |z| greater than 2.
n_df_no_y = n_df.drop(columns=['y'])
z_signed = stats.zscore(n_df_no_y)
z = np.abs(z_signed)
outlier_mask = z > 2
outlier_mask.sum()
Out[15]:
Age     144
FCVC     82
NCP     243
CH2O      0
FAF      99
TUE     144
dtype: int64
In [16]:
# Age values whose absolute z-score exceeds 2, in ascending order.
n_df_no_y.loc[z['Age'] > 2, 'Age'].sort_values()
Out[16]:
1571 37.056193
1488 37.063599
1607 37.084742
1515 37.186795
1107 37.205173
...
1158 55.022494
1088 55.137881
1013 55.246250
252 56.000000
133 61.000000
Name: Age, Length: 144, dtype: float64
In [17]:
# Age values whose absolute z-score is at most 2, in ascending order.
n_df_no_y.loc[z['Age'] <= 2, 'Age'].sort_values()
Out[17]:
415 14.000000
116 15.000000
276 16.000000
302 16.000000
309 16.000000
...
760 36.769646
1700 36.839761
368 37.000000
362 37.000000
387 37.000000
Name: Age, Length: 1967, dtype: float64
In [18]:
# FCVC values whose absolute z-score exceeds 2, in ascending order.
n_df_no_y.loc[z['FCVC'] > 2, 'FCVC'].sort_values()
Out[18]:
23 1.000000
479 1.000000
473 1.000000
449 1.000000
419 1.000000
...
1006 1.317729
876 1.321028
1501 1.330700
1074 1.341380
575 1.344854
Name: FCVC, Length: 82, dtype: float64
In [19]:
# FCVC values whose absolute z-score is at most 2, in ascending order.
n_df_no_y.loc[z['FCVC'] <= 2, 'FCVC'].sort_values()
Out[19]:
1238 1.362441
1499 1.368978
1527 1.369529
1602 1.387489
1528 1.392665
...
417 3.000000
418 3.000000
420 3.000000
803 3.000000
2110 3.000000
Name: FCVC, Length: 2029, dtype: float64
In [20]:
# NCP values whose absolute z-score exceeds 2, in ascending order.
n_df_no_y.loc[z['NCP'] > 2, 'NCP'].sort_values()
Out[20]:
4 1.000000
674 1.000000
675 1.000000
739 1.000000
741 1.000000
...
1556 1.109956
1319 1.114564
1424 1.116401
1780 1.120102
987 1.124977
Name: NCP, Length: 243, dtype: float64
In [21]:
# NCP values whose absolute z-score is at most 2, in ascending order.
n_df_no_y.loc[z['NCP'] <= 2, 'NCP'].sort_values()
Out[21]:
1437 1.130751
800 1.131695
1779 1.134042
1591 1.134321
1417 1.135278
...
702 4.000000
122 4.000000
397 4.000000
481 4.000000
420 4.000000
Name: NCP, Length: 1868, dtype: float64
In [22]:
# FAF values whose absolute z-score exceeds 2, in ascending order.
n_df_no_y.loc[z['FAF'] > 2, 'FAF'].sort_values()
Out[22]:
1065 2.721646
927 2.724300
926 2.762711
579 2.784471
1424 2.787319
...
247 3.000000
241 3.000000
239 3.000000
218 3.000000
356 3.000000
Name: FAF, Length: 99, dtype: float64
In [23]:
# FAF values whose absolute z-score is at most 2, in ascending order.
n_df_no_y.loc[z['FAF'] <= 2, 'FAF'].sort_values()
Out[23]:
0 0.000000
440 0.000000
439 0.000000
1025 0.000000
436 0.000000
...
715 2.697949
1236 2.698874
775 2.707882
1018 2.708250
1319 2.710338
Name: FAF, Length: 2012, dtype: float64
In [24]:
# TUE values whose absolute z-score exceeds 2, in ascending order.
n_df_no_y.loc[z['TUE'] > 2, 'TUE'].sort_values()
Out[24]:
1574 1.875683
1276 1.882539
639 1.884138
1105 1.886855
786 1.887386
...
350 2.000000
352 2.000000
354 2.000000
334 2.000000
484 2.000000
Name: TUE, Length: 144, dtype: float64
In [25]:
# TUE values whose absolute z-score is at most 2, in ascending order.
n_df_no_y.loc[z['TUE'] <= 2, 'TUE'].sort_values()
Out[25]:
1228 0.000000
476 0.000000
1618 0.000000
1025 0.000000
1619 0.000000
...
560 1.839862
696 1.843830
1367 1.865851
1526 1.875023
1548 1.875023
Name: TUE, Length: 1967, dtype: float64